In [None]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression

In [None]:
def lm(x, y, data, intercept=True):
    """Returns the ordinary-least-squares coefficients from regressing y on x.

    Non-numeric predictors are expanded with
    ``pd.get_dummies(..., drop_first=True)``, so a categorical variable with
    k levels contributes k-1 0-1 columns to the design matrix.

    Inputs:
    - x: a list containing the names of the x variables
    - y: the name of the y variable
    - data: a Pandas data frame (the names in x and y must be columns in this data frame)
    - intercept: boolean indicating whether or not to include an intercept term

    Outputs: A Pandas series with the estimated coefficients, indexed by the
    design-matrix column names: "Intercept" first (when requested), then the
    numeric x variables and the dummy columns named by ``pd.get_dummies``.
    """
    # Copy the names so the caller's list is never aliased or modified.
    predictors = list(x)

    if predictors:
        # get_dummies passes numeric columns through untouched and replaces
        # each categorical column by its k-1 indicator columns.
        design = pd.get_dummies(data[predictors], drop_first=True)
    else:
        # No predictors: a zero-column design (intercept-only model at most).
        design = pd.DataFrame(index=data.index)

    names = list(design.columns)
    # Indicator columns may be bool/uint8; force one float matrix for lstsq.
    features = design.to_numpy(dtype=float)

    if intercept:
        features = np.column_stack([np.ones(len(features)), features])
        names = ["Intercept"] + names

    response = data[y].to_numpy(dtype=float)

    # lstsq minimises ||features @ beta - response||^2 and, for a
    # rank-deficient design, returns the minimum-norm solution.
    beta, *_ = np.linalg.lstsq(features, response, rcond=None)

    return pd.Series(data=beta, index=names)

## Some Data To Test Your Code

In [None]:
# Names for the first 25 columns of the UCI "imports-85" file, in file order.
# The 26th column, "price", is appended when the file is read.
predictors = [
    "symboling",
    "normalized-losses",
    "make",
    "fuel-type",
    "aspiration",
    "num-of-doors",
    "body-style",
    "drive-wheels",
    "engine-location",
    "wheel-base",
    "length",
    "width",
    "height",
    "curb-weight",
    "engine-type",
    "num-of-cylinders",
    "engine-size",
    "fuel-system",
    "bore",
    "stroke",
    "compression-ratio",
    "horsepower",
    "peak-rpm",
    "city-mpg",
    "highway-mpg",
]

# header=None: no row of the file is used as a header, so the column names
# are supplied explicitly through `names`.
data = pd.read_csv(
    "http://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data",
    names=[*predictors, "price"],
    header=None,
)

The following code drops every row that contains a missing value (represented by "?" in this data set) and converts columns to numeric types where possible, so that linear regression can be fit to the data.

In [None]:
# Shape before cleaning.
print(data.shape)

for col in data.columns:
    # Only text-like columns can hold the "?" placeholder. The string-dtype
    # check also covers pandas' dedicated string dtype, not just `object`.
    if pd.api.types.is_object_dtype(data[col]) or pd.api.types.is_string_dtype(data[col]):
        # Drop the rows where this column is missing. The explicit copy makes
        # the assignment below write to an independent frame instead of a
        # slice (avoids SettingWithCopyWarning on older pandas).
        data = data[data[col] != "?"].copy()
        try:
            data[col] = pd.to_numeric(data[col])
        except (ValueError, TypeError):
            # Not parseable as numbers: the column stays as text.
            # Any other kind of error is allowed to surface.
            pass

# Shape after cleaning.
print(data.shape)

## Test 1: Quantitative Predictors Only

Let's test out the `lm` function you just wrote on some quantitative predictors.

In [None]:
# Regress price on three numeric predictors; the returned Series is the
# cell's displayed output.
lm(x=["length", "width", "height"], y="price", data=data)

Check that your `lm` function produces the same results as scikit-learn.

In [None]:
# Reference fit with scikit-learn on the same three predictors.
# `fit` returns the estimator itself, so construction and fitting are chained.
model = LinearRegression().fit(
    X=data[["length", "width", "height"]],
    y=data["price"],
)
# Displayed as (intercept, array of slopes).
(model.intercept_, model.coef_)

## Test 2: Categorical Predictors

Your `lm` function should also handle categorical variables automatically; that is, it should expand each categorical variable with $k$ levels into $k-1$ 0-1 (dummy) variables.

In [None]:
# Regress price on every predictor column; the returned Series is the
# cell's displayed output.
lm(x=predictors, y="price", data=data)

Check that your `lm` function produces the same results as scikit-learn.

In [None]:
# Dummy-code the predictor columns with get_dummies, dropping the first level
# of each encoded column, then fit scikit-learn on the expanded frame.
data_expanded = pd.get_dummies(data[predictors], drop_first=True)
model = LinearRegression().fit(X=data_expanded, y=data["price"])
# Displayed as (intercept, array of slopes in data_expanded column order).
(model.intercept_, model.coef_)